http://sebastianraschka.com/Articles/heatmaps_in_r.html
In this section we’ll continue using mpg and CRC dataset.
library(Biobase)
################################################################
# Read data
################################################################
CRC <- read.csv("./data/CRC_train.csv")
CRC.prot <- CRC[,1:72]
CRC.anno <- CRC[,73:79]
# Deal with missing value
# First option: remove the samples with missing values
dim(na.omit(CRC.prot))
## [1] 89 72
# Second option: impute the missing values
random.imp <- function (a){
missing <- is.na(a)
n.missing <- sum(missing)
a.obs <- a[!missing]
imputed <- a
# imputed[missing] <- sample(a.obs, n.missing, replace=TRUE)
imputed[missing] <- median(a.obs)
# imputed[missing] <- 0
return (imputed)
}
pMiss <- function(x){sum(is.na(x))/length(x)*100}
samplemissing <- apply(CRC.prot,1,pMiss)
# Only keep the samples with less than 5% missing values
selectedSamples <- which(samplemissing <= 5)
imputed.CRC.prot <- t(apply(CRC.prot[selectedSamples,], 1, function(x) random.imp(x)))
imputed.CRC.anno <- CRC.anno[selectedSamples,]
library(ggplot2)
library(tidyr)
library(RColorBrewer)
CRC.biomarkers <- CRC[,c("CP", "PON1", "SERPINA3", "LRG1", "TIMP1")]
cor.biomarker <- cor(CRC.biomarkers, use = "pairwise.complete.obs")
cor.biomarker <- as.data.frame(cor.biomarker)
cor.biomarker$Protein1 <- rownames(cor.biomarker)
cor.biomarker <- cor.biomarker %>% gather(Protein2, Correlation, -Protein1)
g <- ggplot(cor.biomarker, aes(Protein1, Protein2))
g + geom_tile(aes(fill=Correlation)) + scale_fill_gradientn(colours = brewer.pal(5,"Spectral"))
################################################################
################################################################
#################### H E A T M A P S ########################
# Iconic graphics for high-throughput data
################## CAUTION: HIDDEN PITFALLS ####################
################################################################
################################################################
################################################################
# Heatmap of individual values of proteins
################################################################
heatmap(t(imputed.CRC.prot))
?heatmap
# Change the font of row and column label
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2))
# Don't do cluster on rows
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Rowv = NA)
# Don't do cluster on columns
heatmap(t(imputed.CRC.prot), cexRow = 0.3, cexCol = 0.2, margins = c(2, 2), Colv = NA)
library(bioDist)
# toy example: 3 genes, 5 samples
gene1 <- c(1, 6, 2, 4, 7)
gene2 <- gene1 + 4
gene3 <- gene2/3 + c(0, 2, 0, 4, 0)
e <- rbind(gene1, gene2, gene3)
dimnames(e) <- list(paste("gene", 1:3, sep=""), paste("sample", 1:5, sep=""))
e
## sample1 sample2 sample3 sample4 sample5
## gene1 1.000000 6.000000 2 4.000000 7.000000
## gene2 5.000000 10.000000 6 8.000000 11.000000
## gene3 1.666667 5.333333 2 6.666667 3.666667
# plot
e1 <- as.data.frame(e)
e1$gene <- rownames(e1)
e1 <- e1 %>% gather(Sample, Abundance, -gene)
ggplot(e1) +
geom_line(aes(x=Sample, y = Abundance, group = gene, colour=gene))
# default: distance fnction eucledian
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian distance")
# correlation
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation distance")
genes are multivariate objects (=observations) in the space of samples (=variables) if we are not interested in similarity of abundance but in similarity of the direction of change, then it will help to standardize the observations across variables
now each gene has mean 0 and sample variance 1 across arrays
library(genefilter)
e_centerScale <- ( e - rowMeans(e) ) / rowSds(e)
e1 <- as.data.frame(e_centerScale)
e1$gene <- rownames(e1)
e1 <- e1 %>% gather(Sample, Abundance, -gene)
ggplot(e1) +
geom_line(aes(x=Sample, y = Abundance, group = gene, colour=gene))
# Eucledian distance is sensitive to centering and scaling
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, raw")
heatmap(e_centerScale, distfun=euc, margins=c(10,10), main="Eucledian, centered and scaled")
# Correlation distance is not sensitive to centering and scaling
heatmap(e, distfun=cor.dist, margins=c(10,10), main="Correlation, raw")
heatmap(e_centerScale, distfun=cor.dist, margins=c(10,10), main="Correlation, centered and scaled")
Note that centering and scaling of genes across samples + correlation distance affect the order of the samples). My choice is to use correlation on the original values, or to compute dendrograms separately, save the order of genes or samples, and use “image” to visualize
However it only affects the color, and not the calculation of dendrograms. It’s usually better to make custom color definitions
# scaling of color matters for unscaled data
heatmap(e, distfun=euc, margins=c(10,10), scale="none", main="Eucledian, raw")
heatmap(e, distfun=euc, margins=c(10,10), main="Eucledian, scaled")
# Conclusion: genes 1 and 2 are far from each other on the dendrogram (because of Eucledian distance) but have same color patterns of intensities (because of scaled colors)
# scaling of color irrelevant if data themselves are scaled already
heatmap(e_centerScale, distfun=euc, margins=c(10,10), scale="none")
heatmap(e_centerScale, distfun=euc, margins=c(10,10))
Example with real dataset
# Default: raw data, scaled by row (i.e. gene)
heatmap(t(imputed.CRC.prot))
col<- colorRampPalette(c("red", "white", "blue"))(256)
heatmap(t(imputed.CRC.prot), col = col)
# change color code: non-standardized expressions
# equal-spaced breaks for values of abundance
summary(as.vector(t(imputed.CRC.prot))) # min ~ 6, max ~ 25
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.583 10.496 13.086 13.535 16.058 24.591
quantile.range <- quantile(imputed.CRC.prot, probs = seq(0, 1, 0.01))
palette.breaks <- seq(quantile.range["5%"], quantile.range["95%"], by = 0.5)
color.palette <- colorRampPalette(brewer.pal(11,"Spectral"))(length(palette.breaks) - 1)
heatmap(t(imputed.CRC.prot), col = color.palette, scale = "none", breaks= palette.breaks)
# change color code: protein-standardized expressions
# quantile-spaced breaks in positive and negative directions
imputed_CRC_prot_centerScale <- ( t(imputed.CRC.prot) - rowMeans(t(imputed.CRC.prot)) ) / rowSds(t(imputed.CRC.prot))
exprs_brk <- c( quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale < 0], probs=seq(0, 1, length=10)), 0, quantile(imputed_CRC_prot_centerScale[imputed_CRC_prot_centerScale > 0], seq(0, 1, length=10)))
colors <- colorRampPalette(brewer.pal(11,"Spectral"))(length(exprs_brk)-1)
heatmap(t(imputed.CRC.prot), col = colors, breaks = exprs_brk)
myColor <- rep("blue", nrow(imputed.CRC.prot))
myColor[imputed.CRC.anno$Sub_group == "CRC"] <- "red"
myColor[imputed.CRC.anno$Sub_group == "Healthy"] <- "yellow"
heatmap(t(imputed.CRC.prot), col = colors, breaks = exprs_brk, ColSideColors=myColor)
Distance options: euclidean (default), maximum, canberra, binary, minkowski, manhattan
Cluster options: complete (default), single, average, mcquitty, median, centroid, ward
# Correlation distance
heatmap(t(imputed.CRC.prot),
distfun = cor.dist,
col = colors,
breaks = exprs_brk,
ColSideColors=myColor,
Rowv = NA)
# Euclidean distance
# method "complete" by default
heatmap(t(imputed.CRC.prot),
distfun = euc,
col = colors,
breaks = exprs_brk,
ColSideColors= myColor,
Rowv = NA)
# clusters in the sample space: note the use of (t)
# single linkage
hc <- hclust(dist(imputed.CRC.prot), method="single" )
heatmap(t(imputed.CRC.prot),
col= colors,
breaks=exprs_brk,
ColSideColors=myColor,
Rowv = NA,
Colv = as.dendrogram(hc)) # apply default clustering method
# Average
hc <- hclust(dist(imputed.CRC.prot), method="average" )
heatmap(t(imputed.CRC.prot),
col = colors,
breaks = exprs_brk,
ColSideColors = myColor,
Rowv = NA,
Colv = as.dendrogram(hc)) # apply default clustering method
# Try manhattan distance
manhattan_dis <- dist(imputed.CRC.prot, method = "manhattan")
hc <- hclust(manhattan_dis)
heatmap(t(imputed.CRC.prot),
col = colors,
breaks = exprs_brk,
ColSideColors = myColor,
Rowv = NA,
Colv = as.dendrogram(hc)) # apply default clustering method
# Try maximum distance
maximum_dis <- dist(imputed.CRC.prot, method = "maximum")
hc <- hclust(maximum_dis)
heatmap(t(imputed.CRC.prot),
col = colors,
breaks = exprs_brk,
ColSideColors = myColor,
Rowv = NA,
Colv = as.dendrogram(hc)) # apply default clustering method
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(t(imputed.CRC.prot), srtCol=45)